This report is a case study of the cryptocurrency market from 2013-2018.
We analyze different factors that influence the price of various cryptocurrencies.
In the end, we try to implement a regression model to predict the price of a coin based on its previous values.
import pandas as pd
df = pd.read_csv(
filepath_or_buffer = '../data/all_currencies_table.csv',
index_col = 0
)
trend = pd.read_csv('../data/cryptocurrency_prices_by_date.csv')
def rip(l: str) -> str:
    """Return a normalised form of a coin name for matching across datasets.

    Every occurrence of the characters ``( ) [ ] . / + -`` and the space is
    removed, and the result is lower-cased. All other characters (for
    example apostrophes) are kept.
    """
    # One translate pass deletes all unwanted characters at once instead of
    # chaining a separate str.replace call per character.
    return l.translate(str.maketrans('', '', '()[]./+- ')).lower()
l1 = list(df.name.apply(rip))
l1.sort()
l2 = list(trend.currency.apply(rip).unique())
l2.sort()
t1 = [i for i in l1 if i not in l2]
t2 = [i for i in l2 if i not in l1]
modified_replace = {i: j for i in t1 for j in t2 if i.startswith(j) or j.startswith(i)}
modified_replace.update(
{
'adex': 'adxnet',
'ambrosus': 'amber',
'ammoreloaded': 'ammorewards',
'atmchain': 'attentiontokenofmedia',
'crypto20': 'c20',
'cryptobullion': 'cryptogenicbullion',
'ebitcoin': 'ebtcnew',
'escroco': 'escoro',
'farstcoin': 'firstbitcoincapital',
'futurxe': 'futurexe',
'g3n': 'genstake',
'gaymoney': 'gaycoin',
'kickcoin': 'kickico',
'lbrycredits': 'librarycredit',
"miners'rewar": 'minersrewardtoken',
'monoeci': 'monacocoin',
'russianminer': 'russianminingcoin',
'spectreaidi': 'spectredividend',
'spectreaiut': 'spectreutility',
'unitedtrader': 'uttoken',
'wetrust': 'trust',
'zlancer': 'zcashgold'
}
)
# Normalise the names and apply the manual mapping to the `name` column only
# (a frame-wide df.replace would also rewrite matching values in other columns).
df['name'] = df.name.apply(rip).replace(modified_replace)
l1 = sorted(df.name)
l2 = sorted(trend.currency.apply(rip).unique())
# Sets give constant-time membership tests; the lists keep their sorted order.
table_names, trend_names = set(l1), set(l2)
t1 = [i for i in l1 if i not in trend_names]
t2 = [i for i in l2 if i not in table_names]
# t1 holds names absent from the trend data and t2 only holds trend names,
# so the two never overlap: every name left in t1 has to be removed.
remove = list(t1)
display(d := df[df.duplicated(subset = 'name', keep = False)])
| name | symbol | market_cap | price | circulating_supply | volume_24hr | 1h | 24h | 7d | |
|---|---|---|---|---|---|---|---|---|---|
| number | |||||||||
| 83 | enigma | ENG | 2.421616e+08 | 3.235890 | 7.483617e+07 | 2.160550e+07 | -3.12 | 36.52 | 19.37 |
| 242 | hempcoin | THC | 4.174616e+07 | 0.181128 | 2.304788e+08 | 3.773910e+05 | -1.27 | 18.91 | 0.50 |
| 493 | encryptotel | ETT | 7.652512e+06 | 0.123167 | 6.213119e+07 | 8.636260e+02 | -1.53 | 56.02 | 38.85 |
| 860 | hempcoin | HMP | 2.403202e+05 | 0.000177 | 1.356645e+09 | 5.746430e+02 | -1.00 | 11.42 | 105.11 |
| 1054 | enigma | XNG | 1.979928e+05 | 0.343037 | 5.771763e+05 | 2.020970e+02 | -0.99 | 18.05 | 2.58 |
| 1325 | firstbitcoin | BIT | NaN | 0.041009 | NaN | 7.879950e+03 | -1.84 | 9.63 | 111.89 |
| 1383 | firstbitcoin | BITCF | NaN | 0.240117 | NaN | 1.186050e+03 | -0.99 | -32.45 | 8.73 |
| 1389 | encryptotel | ETT | NaN | 0.079232 | NaN | 9.495290e+02 | -1.23 | 6.45 | -2.17 |
From the above 8 data points, we know that we'll be getting rid of the below 3.
Lets try finding whether the other ones have any more info related to symbol etc.
display(d := list(d.name)[:2])
['enigma', 'hempcoin']
for i in d:
for j in l2:
if i in j:
print(i, j)
enigma enigma enigma enigmaproject hempcoin hempcoin hempcoin hempcoinhmp
In hempcoin, we can clearly make out that the 2nd one maps to hempcoinhmp through the symbol.
In enigma, we'll have to make an assumption that the 2nd one maps to enigmaproject.
d_replace = {
'enigma': 'enigmaproject',
'hempcoin': 'hempcoinhmp'
}
pd.to_pickle([modified_replace, remove, d_replace], filepath_or_buffer = '../Data/Replacements.pkl')
# Importing necessary libraries
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
plt.style.use('dark_background')
df = pd.read_csv(
filepath_or_buffer = '../data/all_currencies_table.csv',
index_col = 0
)
df.head()
| name | symbol | market_cap | price | circulating_supply | volume_24hr | 1h | 24h | 7d | |
|---|---|---|---|---|---|---|---|---|---|
| number | |||||||||
| 1 | Bitcoin | BTC | 1.507030e+11 | 8940.740000 | 1.685576e+07 | 8.103300e+09 | -0.63 | 11.22 | 3.58 |
| 2 | Ethereum | ETH | 8.754697e+10 | 897.729000 | 9.752049e+07 | 3.062570e+09 | -0.41 | 11.99 | -0.25 |
| 3 | Ripple | XRP | 4.062771e+10 | 1.041490 | 3.900922e+10 | 2.504810e+09 | -0.37 | 32.86 | 23.94 |
| 4 | Bitcoin Cash | BCH | 2.294639e+10 | 1353.040000 | 1.695914e+07 | 1.149550e+09 | 0.11 | 7.89 | 18.32 |
| 5 | Cardano | ADA | 1.163729e+10 | 0.448847 | 2.592707e+10 | 7.603720e+08 | -0.71 | 27.62 | 23.61 |
df.tail()
| name | symbol | market_cap | price | circulating_supply | volume_24hr | 1h | 24h | 7d | |
|---|---|---|---|---|---|---|---|---|---|
| number | |||||||||
| 1512 | LiteCoin Gold | LTG | NaN | 0.006132 | NaN | NaN | NaN | 8.37 | -23.59 |
| 1513 | eBIT | EBIT | NaN | 0.004195 | NaN | NaN | NaN | NaN | -38.75 |
| 1514 | Faceblock | FBL | NaN | 0.003230 | NaN | NaN | NaN | NaN | -79.33 |
| 1515 | Farstcoin | FRCT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1516 | Decentralized... | DUBI | NaN | 123.813000 | NaN | NaN | NaN | 10.87 | 11.25 |
From the above dataset, we can tell that name could potentially act as the index of the dataset.
To do the same, we need to make a few adjustments as seen from the mapping notebook.
p = pd.read_pickle(filepath_or_buffer = '../Data/Replacements.pkl')
def rip(l: str) -> str:
    """Return a normalised form of a coin name for matching across datasets.

    Every occurrence of the characters ``( ) [ ] . / + -`` and the space is
    removed, and the result is lower-cased. All other characters (for
    example apostrophes) are kept.
    """
    # One translate pass deletes all unwanted characters at once instead of
    # chaining a separate str.replace call per character.
    return l.translate(str.maketrans('', '', '()[]./+- ')).lower()
# Normalise the names, then apply the saved name mapping (p[0]).
# The result is assigned back instead of using `df['name'].replace(..., inplace = True)`:
# an in-place call on a selected column does not update `df` under pandas Copy-on-Write.
df['name'] = df.name.apply(rip).replace(p[0])
# Later occurrences of every repeated name (the first occurrence is not flagged).
d = df[df.duplicated(subset = 'name')].copy()
d
| name | symbol | market_cap | price | circulating_supply | volume_24hr | 1h | 24h | 7d | |
|---|---|---|---|---|---|---|---|---|---|
| number | |||||||||
| 860 | hempcoin | HMP | 240320.248491 | 0.000177 | 1.356645e+09 | 574.643 | -1.00 | 11.42 | 105.11 |
| 1054 | enigma | XNG | 197992.821499 | 0.343037 | 5.771763e+05 | 202.097 | -0.99 | 18.05 | 2.58 |
| 1383 | firstbitcoin | BITCF | NaN | 0.240117 | NaN | 1186.050 | -0.99 | -32.45 | 8.73 |
| 1389 | encryptotel | ETT | NaN | 0.079232 | NaN | 949.529 | -1.23 | 6.45 | -2.17 |
# Keep the FIRST occurrence of each repeated name in `df`. The later occurrences
# are the rows held in `d`, which are renamed below and added back separately,
# so keeping them here as well would store the same coin twice and lose the
# first-listed coin.
df.drop_duplicates(subset = 'name', keep = 'first', inplace = True)
# Give the second coin of each pair its distinct name from the saved mapping (p[2]).
d['name'] = d['name'].replace(p[2])
# Drop the duplicate rows that still contain missing values.
d.dropna(inplace = True)
d
| name | symbol | market_cap | price | circulating_supply | volume_24hr | 1h | 24h | 7d | |
|---|---|---|---|---|---|---|---|---|---|
| number | |||||||||
| 860 | hempcoinhmp | HMP | 240320.248491 | 0.000177 | 1.356645e+09 | 574.643 | -1.00 | 11.42 | 105.11 |
| 1054 | enigmaproject | XNG | 197992.821499 | 0.343037 | 5.771763e+05 | 202.097 | -0.99 | 18.05 | 2.58 |
Now, we can append the same back to the original dataset.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to add the renamed duplicate rows back.
df = pd.concat([df, d])
# Drop every coin listed in the saved removal list (p[1]).
df = df[~df.name.isin(p[1])]
Now that the necessary changes to name have been done, we can set it as the index.
df.set_index(
keys = ['name', 'symbol'],
inplace = True
)
df.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1501 entries, ('bitcoin', 'BTC') to ('enigmaproject', 'XNG')
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 market_cap 1142 non-null float64
1 price 1499 non-null float64
2 circulating_supply 1119 non-null float64
3 volume_24hr 1487 non-null float64
4 1h 1437 non-null float64
5 24h 1450 non-null float64
6 7d 1463 non-null float64
dtypes: float64(7)
memory usage: 175.8+ KB
df.shape
(1501, 7)
df.apply(
func = lambda l: l.isna().sum(),
axis = 1
).value_counts()
0 1061 2 328 1 56 3 35 4 16 6 2 5 2 7 1 dtype: int64
We can choose to drop data with more than 2 missing values and apply filler techniques for the rest.
fig = msno.bar(
df,
color = 'green'
)
plt.show()
Leaving market_cap and circulating_supply, the number of missing values in the other features is quite low.
ax = msno.matrix(
df,
color = (0, 0, 0.6),
sparkline = False
)
plt.show()
Here, we can see that there is high correspondence of nullity between market_cap and circulating_supply.
For now lets fill all the other features with their respective means.
df[df.price.isna()]
| market_cap | price | circulating_supply | volume_24hr | 1h | 24h | 7d | ||
|---|---|---|---|---|---|---|---|---|
| name | symbol | |||||||
| lltoken | LLT | NaN | NaN | NaN | 0.0 | NaN | NaN | NaN |
| firstbitcoincapital | FRCT | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
For now, we can ignore the price feature, as the 2 data points will eventually be dropped either way.
def filler(l):
    """Return a copy of the Series `l` with its NaNs replaced by the mean of `l`.

    The input is left untouched: functions passed to `DataFrame.apply`
    should not mutate the object they receive.
    """
    return l.fillna(l.mean())
names = df.columns[3:]
print(names)
df[names] = df[names].apply(func = filler)
Index(['volume_24hr', '1h', '24h', '7d'], dtype='object')
df.tail()
| market_cap | price | circulating_supply | volume_24hr | 1h | 24h | 7d | ||
|---|---|---|---|---|---|---|---|---|
| name | symbol | |||||||
| faceblock | FBL | NaN | 0.003230 | NaN | 1.669866e+07 | -0.71247 | 13.828138 | -79.330000 |
| firstbitcoincapital | FRCT | NaN | NaN | NaN | 1.669866e+07 | -0.71247 | 13.828138 | 4.981012 |
| decentralizeduniversalbasicincome | DUBI | NaN | 123.813000 | NaN | 1.669866e+07 | -0.71247 | 10.870000 | 11.250000 |
| hempcoinhmp | HMP | 240320.248491 | 0.000177 | 1.356645e+09 | 5.746430e+02 | -1.00000 | 11.420000 | 105.110000 |
| enigmaproject | XNG | 197992.821499 | 0.343037 | 5.771763e+05 | 2.020970e+02 | -0.99000 | 18.050000 | 2.580000 |
Now that NaNs are dealt with for the other features, we can look into circulating_supply and market_cap.
Both these features are related through an equation that goes like -
$\text{Market Cap} = \text{Current Price} \times \text{Circulating Supply}$
It can be used to fill the NaNs in circulating_supply.
Next, we can drop the common null data points between the two.
# Fill a missing circulating supply with market_cap / price, keeping known values.
# Columns are addressed by label: the row-wise lambda relied on positional
# `l[0]`, `l[1]`, `l[2]` lookups on a label-indexed Series, which is deprecated.
df['circulating_supply'] = df['circulating_supply'].fillna(
    df['market_cap'] / df['price']
)
# Any row that still has a missing value is dropped.
df.dropna(inplace = True)
df.shape
(1142, 7)
We've gotten rid of 359 data points (1501 - 1142) and applied filler methods for the remaining NaNs.
Let's take a look at the bar chart to see if there are any more remaining.
fig = msno.bar(
df,
color = 'green'
)
plt.show()
Now that we've gotten rid of all the NaNs, We can look to rename our features for ease of access during EDA.
df.rename(
columns = {
'market_cap': 'MarketCap',
'price': 'Price',
'circulating_supply': 'Circulate',
'volume_24hr': 'Volume',
'1h': 'Hourly',
'24h': 'Daily',
'7d': 'Weekly'
},
inplace = True
)
df.index.rename(names = ['Name', 'Symbol'], inplace = True)
df.head()
| MarketCap | Price | Circulate | Volume | Hourly | Daily | Weekly | ||
|---|---|---|---|---|---|---|---|---|
| Name | Symbol | |||||||
| bitcoin | BTC | 1.507030e+11 | 8940.740000 | 1.685576e+07 | 8.103300e+09 | -0.63 | 11.22 | 3.58 |
| ethereum | ETH | 8.754697e+10 | 897.729000 | 9.752049e+07 | 3.062570e+09 | -0.41 | 11.99 | -0.25 |
| ripple | XRP | 4.062771e+10 | 1.041490 | 3.900922e+10 | 2.504810e+09 | -0.37 | 32.86 | 23.94 |
| bitcoincash | BCH | 2.294639e+10 | 1353.040000 | 1.695914e+07 | 1.149550e+09 | 0.11 | 7.89 | 18.32 |
| cardano | ADA | 1.163729e+10 | 0.448847 | 2.592707e+10 | 7.603720e+08 | -0.71 | 27.62 | 23.61 |
# Save the cleaned table under '../Data/', which is where the analysis
# notebooks load 'all_currencies_clean_table.csv' from; writing to '../data/'
# only works on case-insensitive file systems.
df.to_csv(
    path_or_buf = '../Data/all_currencies_clean_table.csv'
)
trend = pd.read_csv(
filepath_or_buffer = '../data/cryptocurrency_prices_by_date.csv'
)
trend.head()
| currency | date | price | |
|---|---|---|---|
| 0 | 0x | 1502892561000 | 0.111725 |
| 1 | 0x | 1502982305000 | 0.211486 |
| 2 | 0x | 1503068692000 | 0.283789 |
| 3 | 0x | 1503155108000 | 0.511434 |
| 4 | 0x | 1503241503000 | 0.429522 |
For this dataset, we could start off by bringing the date back to a usual format.
trend['date'] = pd.to_datetime(
arg = trend.date,
unit = 'ms'
).dt.strftime('%d-%m-%Y %H:%M:%S')
trend.tail()
| currency | date | price | |
|---|---|---|---|
| 657311 | zurcoin | 04-02-2018 07:09:10 | 0.003254 |
| 657312 | zurcoin | 05-02-2018 07:09:10 | 0.002774 |
| 657313 | zurcoin | 06-02-2018 07:44:10 | 0.001986 |
| 657314 | zurcoin | 09-02-2018 08:29:06 | 0.002684 |
| 657315 | zurcoin | 10-02-2018 03:54:06 | 0.002325 |
Next up, lets rename our features for ease of access.
trend.rename(
columns = {
'currency' : 'Coin',
'date': 'Date',
'price': 'Price'
},
inplace = True
)
trend.head()
| Coin | Date | Price | |
|---|---|---|---|
| 0 | 0x | 16-08-2017 14:09:21 | 0.111725 |
| 1 | 0x | 17-08-2017 15:05:05 | 0.211486 |
| 2 | 0x | 18-08-2017 15:04:52 | 0.283789 |
| 3 | 0x | 19-08-2017 15:05:08 | 0.511434 |
| 4 | 0x | 20-08-2017 15:05:03 | 0.429522 |
We need to make some changes to the coin names.
trend['Coin'] = trend.Coin.apply(rip)
Lastly, we can drop data that doesn't have corresponding data back in the first dataset.
# Names of the coins that survived cleaning in the first dataset.
index = list(df.index.get_level_values('Name'))
# Keep only price rows whose coin is in that list. `isin` does the membership
# test in one vectorised call instead of a Python-level scan per row.
# The index is renumbered from 0 on the filtered result.
trend = trend[trend.Coin.isin(index)].reset_index(drop = True)
Even though this dataset is quite long, there seems to be no better way to organize the same.
trend.to_csv(
path_or_buf = '../Data/clean_cryptocurrency_prices_by_date.csv'
)
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = 'plotly_dark'
# pio.renderers.default = 'png'
def custom(t, x, y, figure = None):
    """Apply the shared title, axis labels and colours to a plotly figure.

    t      -- figure title
    x, y   -- x-axis and y-axis titles
    figure -- figure to style; when omitted, the module-level `fig` is used,
              which is how the existing calls in this file behave.
    """
    target = fig if figure is None else figure
    target.update_layout(
        title = t,
        xaxis_title = x,
        yaxis_title = y,
        font_color = 'orange',
        title_font_color = '#00fe35'
    )
df = pd.read_csv(
filepath_or_buffer = '../Data/all_currencies_clean_table.csv',
index_col = ['Name', 'Symbol']
)
name = pd.Series(df.index.get_level_values(0))
df.head()
| MarketCap | Price | Circulate | Volume | Hourly | Daily | Weekly | ||
|---|---|---|---|---|---|---|---|---|
| Name | Symbol | |||||||
| bitcoin | BTC | 1.507030e+11 | 8940.740000 | 1.685576e+07 | 8.103300e+09 | -0.63 | 11.22 | 3.58 |
| ethereum | ETH | 8.754697e+10 | 897.729000 | 9.752049e+07 | 3.062570e+09 | -0.41 | 11.99 | -0.25 |
| ripple | XRP | 4.062771e+10 | 1.041490 | 3.900922e+10 | 2.504810e+09 | -0.37 | 32.86 | 23.94 |
| bitcoincash | BCH | 2.294639e+10 | 1353.040000 | 1.695914e+07 | 1.149550e+09 | 0.11 | 7.89 | 18.32 |
| cardano | ADA | 1.163729e+10 | 0.448847 | 2.592707e+10 | 7.603720e+08 | -0.71 | 27.62 | 23.61 |
df.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1142 entries, ('bitcoin', 'BTC') to ('enigmaproject', 'XNG')
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MarketCap 1142 non-null float64
1 Price 1142 non-null float64
2 Circulate 1142 non-null float64
3 Volume 1142 non-null float64
4 Hourly 1142 non-null float64
5 Daily 1142 non-null float64
6 Weekly 1142 non-null float64
dtypes: float64(7)
memory usage: 149.3+ KB
In the above dataset, we can see data about 1142 different cryptocurrencies at a static point in time (i.e. 02/09/2018 12:10pm).
Which makes all the datapoints unrelated except for the fact that all of them represent a certain cryptocurrency.
Thus one could say that all this data picked feature-wise would be discrete data.
tr = pd.read_csv(
    filepath_or_buffer = '../Data/clean_cryptocurrency_prices_by_date.csv',
    index_col = 0
)
# Parse the dates with the exact day-first format. `infer_datetime_format`
# is a boolean flag, so passing it a format string never selected this format.
tr['Date'] = pd.to_datetime(tr['Date'], format = '%d-%m-%Y %H:%M:%S')
tr.head()
| Coin | Date | Price | |
|---|---|---|---|
| 0 | 0x | 2017-08-16 14:09:21 | 0.111725 |
| 1 | 0x | 2017-08-17 15:05:05 | 0.211486 |
| 2 | 0x | 2017-08-18 15:04:52 | 0.283789 |
| 3 | 0x | 2017-08-19 15:05:08 | 0.511434 |
| 4 | 0x | 2017-08-20 15:05:03 | 0.429522 |
tr.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 570927 entries, 0 to 570926 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Coin 570927 non-null object 1 Date 570927 non-null datetime64[ns] 2 Price 570927 non-null float64 dtypes: datetime64[ns](1), float64(1), object(1) memory usage: 17.4+ MB
tr.head()
| Coin | Date | Price | |
|---|---|---|---|
| 0 | 0x | 2017-08-16 14:09:21 | 0.111725 |
| 1 | 0x | 2017-08-17 15:05:05 | 0.211486 |
| 2 | 0x | 2017-08-18 15:04:52 | 0.283789 |
| 3 | 0x | 2017-08-19 15:05:08 | 0.511434 |
| 4 | 0x | 2017-08-20 15:05:03 | 0.429522 |
In this dataset, we can see how price changes over a certain period of time where the cryptocurrency exists.
Our time period of interest being 2014-2018 as that is when the cryptocurrency market boomed.
We select features from which we may have to remove outliers to continue our analysis efficiently.
We only select the features that are not spanning over a period of time and ignore the ones like daily trend as they are very volatile.
fig = px.violin(
data_frame = df,
x = 'MarketCap',
hover_data = df,
hover_name = name
)
custom(
x = 'Market Capitalization',
y = 'Density of Cryptocurrencies',
t = 'Violin Plot of Market Capitalization for all the Cryptocurrencies'
)
fig.update_xaxes(
rangeslider_visible = True,
rangeslider_bgcolor = '#202020',
range = [-35e8, 12e9]
)
fig.show()
In the above Violin plot, we can autoscale to see that there are 4 outliers namely Bitcoin, Ethereum, Ripple and Bitcoincash.
We observe a relatively high Market Capitalization as these cryptocurrencies were the pioneers in the cryptocurrency market.
outliers_market = ['bitcoin', 'ethereum', 'ripple', 'bitcoincash']
fig = px.line(
data_frame = tr[tr.Coin.apply(lambda l: l in outliers_market)],
x = 'Date',
y = 'Price',
color = 'Coin'
)
custom(
t = 'Price Change of the top 4 Cryptocurrencies (by Market Capitalization)',
x = 'Date',
y = 'Price (in USD)'
)
fig.update_xaxes(
rangeslider_visible = True,
rangeslider_bgcolor = '#202020'
)
fig.show()
fig = px.violin(
data_frame = df,
x = 'Price',
hover_data = df,
hover_name = name
)
custom(
t = 'Violin Plot of the Prices of all the Cryptocurrencies',
x = 'Price (in USD)',
y = 'Density of Cryptocurrencies'
)
fig.update_xaxes(
rangeslider_visible = True,
rangeslider_bgcolor = '#202020',
range = [-2e4, 7e4]
)
fig.show()
In the above Violin plot, we can again autoscale to see that there are 3 outliers namely Bit20, Projectx and 42coin.
These cryptocurrencies are relatively more expensive than all the other ones in the market.
outliers_price = ['bit20', 'projectx', '42coin']
df.loc[outliers_price]
| MarketCap | Price | Circulate | Volume | Hourly | Daily | Weekly | ||
|---|---|---|---|---|---|---|---|---|
| Name | Symbol | |||||||
| bit20 | BTWTY | 9.596181e+05 | 944506.0 | 1.016000 | 666.156 | 0.32 | 17.33 | 3.74 |
| projectx | NANOX | 1.893801e+04 | 241976.0 | 0.078264 | 1541.280 | -0.99 | 12.34 | -18.31 |
| 42coin | 42 | 2.990944e+06 | 71213.0 | 41.999971 | 5076.870 | -1.00 | 16.36 | 2.75 |
fig = px.line(
data_frame = tr[tr.Coin.apply(lambda l: l in outliers_price)],
x = 'Date',
y = 'Price',
color = 'Coin'
)
custom(
t = 'Price Change of the top 3 Cryptocurrencies (by Price)',
x = 'Date',
y = 'Price (in USD)'
)
fig.update_xaxes(
rangeslider_visible = True,
rangeslider_bgcolor = '#202020'
)
fig.show()
fig = px.violin(
data_frame = df,
x = 'Circulate',
hover_data = df,
hover_name = name
)
custom(
t = 'Violin Plot of the Circulating Supply of all the Cryptocurrencies',
x = 'Circulating Supply',
y = 'Density of Cryptocurrencies'
)
fig.update_xaxes(
rangeslider_visible = True,
rangeslider_bgcolor = '#202020',
range = [-60e9, 250e9]
)
fig.show()
Finally, after autoscaling the above Violin plot, we can observe that there are 5 outliers namely Sprouts, Paccoin, Kin, Dimecoin and Fedoracoin.
outliers_circulate = ['sprouts', 'paccoin', 'kin', 'dimecoin', 'fedoracoin']
df.loc[outliers_circulate]
| MarketCap | Price | Circulate | Volume | Hourly | Daily | Weekly | ||
|---|---|---|---|---|---|---|---|---|
| Name | Symbol | |||||||
| sprouts | SPRTS | 1.020791e+07 | 0.000004 | 2.704104e+12 | 19672.60 | 3.14 | 20.04 | 9.78 |
| paccoin | PAC | 9.483727e+07 | 0.000037 | 2.568974e+12 | 547309.00 | 2.23 | 62.56 | -9.69 |
| kin | KIN | 2.229898e+08 | 0.000295 | 7.560976e+11 | 283177.00 | -4.38 | 8.88 | -6.46 |
| dimecoin | DIME | 3.223906e+07 | 0.000060 | 5.390967e+11 | 85035.20 | 2.21 | 10.72 | 0.49 |
| fedoracoin | TIPS | 7.491492e+06 | 0.000017 | 4.431682e+11 | 4968.61 | -1.90 | 2.20 | -18.48 |
fig = px.line(
data_frame = tr[tr.Coin.apply(lambda l: l in outliers_circulate)],
x = 'Date',
y = 'Price',
color = 'Coin'
)
custom(
t = 'Price Change of the top 5 Cryptocurrencies (by Circulating Supply)',
x = 'Date',
y = 'Price (in USD)'
)
fig.update_xaxes(
rangeslider_visible = True,
rangeslider_bgcolor = '#202020'
)
fig.show()
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
pio.templates.default = 'plotly_dark'
# pio.renderers.default = 'png'
def custom(t, x, y, figure = None):
    """Apply the shared title, axis labels and colours to a plotly figure.

    t      -- figure title
    x, y   -- x-axis and y-axis titles
    figure -- figure to style; when omitted, the module-level `fig` is used,
              which is how the existing calls in this file behave.
    """
    target = fig if figure is None else figure
    target.update_layout(
        title = t,
        xaxis_title = x,
        yaxis_title = y,
        font_color = '#ff8c00',
        title_font_color = '#00fe35'
    )
df = pd.read_csv(
filepath_or_buffer = '../Data/all_currencies_clean_table.csv',
index_col = ['Name', 'Symbol']
)
df.head()
| MarketCap | Price | Circulate | Volume | Hourly | Daily | Weekly | ||
|---|---|---|---|---|---|---|---|---|
| Name | Symbol | |||||||
| bitcoin | BTC | 1.507030e+11 | 8940.740000 | 1.685576e+07 | 8.103300e+09 | -0.63 | 11.22 | 3.58 |
| ethereum | ETH | 8.754697e+10 | 897.729000 | 9.752049e+07 | 3.062570e+09 | -0.41 | 11.99 | -0.25 |
| ripple | XRP | 4.062771e+10 | 1.041490 | 3.900922e+10 | 2.504810e+09 | -0.37 | 32.86 | 23.94 |
| bitcoincash | BCH | 2.294639e+10 | 1353.040000 | 1.695914e+07 | 1.149550e+09 | 0.11 | 7.89 | 18.32 |
| cardano | ADA | 1.163729e+10 | 0.448847 | 2.592707e+10 | 7.603720e+08 | -0.71 | 27.62 | 23.61 |
df.info()
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1142 entries, ('bitcoin', 'BTC') to ('enigmaproject', 'XNG')
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MarketCap 1142 non-null float64
1 Price 1142 non-null float64
2 Circulate 1142 non-null float64
3 Volume 1142 non-null float64
4 Hourly 1142 non-null float64
5 Daily 1142 non-null float64
6 Weekly 1142 non-null float64
dtypes: float64(7)
memory usage: 149.3+ KB
In the above dataset, we can see data about 1142 different cryptocurrencies at a static point in time (i.e. 02/09/2018 12:10pm).
Which makes all the datapoints unrelated except for the fact that all of them represent a certain cryptocurrency.
Thus one could say that all this data picked feature-wise would be discrete data.
tr = pd.read_csv(
    filepath_or_buffer = '../Data/clean_cryptocurrency_prices_by_date.csv',
    index_col = 0
)
# Parse the dates with the exact day-first format. `infer_datetime_format`
# is a boolean flag, so passing it a format string never selected this format.
tr['Date'] = pd.to_datetime(tr['Date'], format = '%d-%m-%Y %H:%M:%S')
tr.head()
| Coin | Date | Price | |
|---|---|---|---|
| 0 | 0x | 2017-08-16 14:09:21 | 0.111725 |
| 1 | 0x | 2017-08-17 15:05:05 | 0.211486 |
| 2 | 0x | 2017-08-18 15:04:52 | 0.283789 |
| 3 | 0x | 2017-08-19 15:05:08 | 0.511434 |
| 4 | 0x | 2017-08-20 15:05:03 | 0.429522 |
tr.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 570927 entries, 0 to 570926 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Coin 570927 non-null object 1 Date 570927 non-null datetime64[ns] 2 Price 570927 non-null float64 dtypes: datetime64[ns](1), float64(1), object(1) memory usage: 17.4+ MB
tr.head()
| Coin | Date | Price | |
|---|---|---|---|
| 0 | 0x | 2017-08-16 14:09:21 | 0.111725 |
| 1 | 0x | 2017-08-17 15:05:05 | 0.211486 |
| 2 | 0x | 2017-08-18 15:04:52 | 0.283789 |
| 3 | 0x | 2017-08-19 15:05:08 | 0.511434 |
| 4 | 0x | 2017-08-20 15:05:03 | 0.429522 |
In this dataset, we can see how price changes over a certain period of time where the cryptocurrency exists.
Our time period of interest being 2014-2018 as that is when the cryptocurrency market boomed.
outliers = ['bitcoin', 'ethereum', 'ripple', 'bitcoincash', 'bit20', 'projectx', '42coin', 'sprouts', 'paccoin', 'kin', 'dimecoin', 'fedoracoin', 'unityingot', 'ecoin']
These are all the observed outliers based on the first three features.
And two observational ones from the trend visualizations, unityingot and ecoin, because of their enormous change percentages.
# Drop every row whose coin name (index level 0) is a listed outlier.
# A boolean mask keeps each remaining row exactly once, whereas selecting by a
# list of labels would repeat rows if a name ever appeared more than once.
df = df[~df.index.get_level_values(0).isin(outliers)]
# Coin names of the remaining rows, used for hover labels in the plots.
name = pd.Series(df.index.get_level_values(0))
df.head()
| MarketCap | Price | Circulate | Volume | Hourly | Daily | Weekly | ||
|---|---|---|---|---|---|---|---|---|
| Name | Symbol | |||||||
| cardano | ADA | 1.163729e+10 | 0.448847 | 2.592707e+10 | 760372000.0 | -0.71 | 27.62 | 23.61 |
| litecoin | LTC | 9.109557e+09 | 165.179000 | 5.514961e+07 | 769065000.0 | -1.62 | 13.37 | 28.81 |
| stellar | XLM | 8.047186e+09 | 0.436583 | 1.843220e+10 | 212032000.0 | 0.05 | 21.88 | 14.83 |
| neo | NEO | 7.740330e+09 | 119.082000 | 6.500000e+07 | 304126000.0 | -1.08 | 6.72 | 2.78 |
| eos | EOS | 6.547181e+09 | 9.929270 | 6.593819e+08 | 606048000.0 | -1.05 | 19.34 | 6.47 |
fig = px.violin(
data_frame = df,
x = 'MarketCap',
hover_data = df,
hover_name = name
)
custom(
x = 'Market Capitalization',
y = 'Density of Cryptocurrencies',
t = 'Violin Plot of Market Capitalization for all the Cryptocurrencies'
)
fig.update_xaxes(
rangeslider_visible = True,
rangeslider_bgcolor = '#202020',
range = [-3e8, 2e9]
)
fig.show()
fig = px.violin(
data_frame = df,
x = 'Price',
hover_data = df,
hover_name = name
)
custom(
t = 'Violin Plot of the Prices of all the Cryptocurrencies',
x = 'Price (in USD)',
y = 'Density of Cryptocurrencies'
)
fig.update_xaxes(
rangeslider_visible = True,
rangeslider_bgcolor = '#202020',
range = [-3e2, 2e3]
)
fig.show()
# Violin plot of the circulating supply with coin names on hover.
fig = px.violin(
    data_frame = df,
    x = 'Circulate',
    hover_data = df,
    hover_name = name
)
# The chart is a violin plot, so the title and y-axis label follow the
# wording of the other violin plots instead of calling it a histogram.
custom(
    t = 'Violin Plot of the Circulating Supply of all the Cryptocurrencies',
    x = 'Circulating Supply',
    y = 'Density of Cryptocurrencies'
)
fig.update_xaxes(
    rangeslider_visible = True,
    rangeslider_bgcolor = '#202020',
    range = [-7e9, 38e9]
)
fig.show()
After the removal of the 14 outliers, our data looks quite similar even though a bit more spread out.
This is because there will always be values that easily dominate the rest because of the pareto principle.
trends = ['Hourly', 'Daily', 'Weekly']
# One horizontal bar chart per period, side by side, sharing the coin axis.
fig = make_subplots(
    cols = 3,
    shared_yaxes = True,
    horizontal_spacing = 0,
    subplot_titles = trends
)
for col, period in enumerate(trends, start = 1):
    change = df[period]
    fig.add_bar(
        x = change,
        y = name,
        # Label the trace with its period rather than the loop counter.
        name = period,
        orientation = 'h',
        showlegend = False,
        # Green for a positive change, red for everything else.
        marker_color = change.gt(0).map({True: 'green', False: 'red'}),
        row = 1,
        col = col
    )
fig.update_layout(
    height = 4000,
    width = 1369,
    title = 'Percentage Change in the Price of Coin',
    font_color = '#ff8c00',
    title_font_color = '#00fe35'
)
fig.show()
From this graph, we can see, that on an hourly basis, the market is very bearish, since we have more red bars than green.
However on a daily basis, the market is extremely bullish, since we have way more greens than reds and there is a lot of strength in the trend.
On a weekly basis, we have almost equal number of reds and greens, but the greens are way taller than the reds.
This implies that the market is following a slightly weak bullish trend.
def pieplot(name, data = None):
    """Return `[gains, losses]` for the column `name`.

    `gains` is the number of rows whose value is strictly greater than 0 and
    `losses` is the number of all other rows. Both are always present, so a
    column where every value has the same sign yields a 0 instead of a KeyError.

    name -- column to count
    data -- DataFrame to read from; when omitted, the module-level `df` is used.
    """
    frame = df if data is None else data
    column = frame[name]
    gains = int((column > 0).sum())
    return [gains, len(column) - gains]
fig = make_subplots(
cols = 3,
subplot_titles = ['Hourly', 'Daily', 'Weekly'],
specs = [[{'type':'domain'}, {'type':'domain'}, {'type':'domain'}]]
)
for i in range(3):
fig.add_pie(
values = pieplot(trends[i]),
labels = ['Profit', 'Loss'],
marker_colors = ['green', 'red'],
name = trends[i],
row = 1,
col = i + 1
)
fig.update_layout(
title = 'Profit-Loss Pie Plots',
font_color = '#ff8c00',
title_font_color = '#00fe35'
)
fig.show()
These pie charts show us the collective nature of the entire market.
We can see the percentage of Cryptocurrencies gaining a profit or going into a loss.
# Standard deviation of each coin's price history, computed in one grouped
# pass over `tr` instead of filtering the whole table once per coin.
# `reindex` puts the results in the same order as `name`; a coin with no
# price history gets NaN.
volatility = tr.groupby('Coin').Price.std().reindex(name)
df['Volatility'] = volatility.values
df.head()
| MarketCap | Price | Circulate | Volume | Hourly | Daily | Weekly | Volatility | ||
|---|---|---|---|---|---|---|---|---|---|
| Name | Symbol | ||||||||
| cardano | ADA | 1.163729e+10 | 0.448847 | 2.592707e+10 | 760372000.0 | -0.71 | 27.62 | 23.61 | 0.310058 |
| litecoin | LTC | 9.109557e+09 | 165.179000 | 5.514961e+07 | 769065000.0 | -1.62 | 13.37 | 28.81 | 45.002213 |
| stellar | XLM | 8.047186e+09 | 0.436583 | 1.843220e+10 | 212032000.0 | 0.05 | 21.88 | 14.83 | 0.100708 |
| neo | NEO | 7.740330e+09 | 119.082000 | 6.500000e+07 | 304126000.0 | -1.08 | 6.72 | 2.78 | 34.842500 |
| eos | EOS | 6.547181e+09 | 9.929270 | 6.593819e+08 | 606048000.0 | -1.05 | 19.34 | 6.47 | 4.345754 |
In the above code cell, we calculate the volatility of each cryptocurrency.
We calculate it by taking the standard deviation of the price over their respective active time period.
fig = px.violin(
data_frame = df,
x = 'Volatility',
hover_data = df,
hover_name = name
)
custom(
t = 'Violin Plot of the Volatility of all the Cryptocurrencies',
x = 'Volatility (in USD)',
y = 'Density of Cryptocurrencies'
)
fig.update_xaxes(
rangeslider_visible = True,
rangeslider_bgcolor = '#202020',
range = [-150, 500]
)
fig.show()
fig = px.imshow(
df.corr(),
color_continuous_scale = 'algae'
)
fig.update_layout(
title = 'Heatmap of the Correlation Matrix of our dataset',
xaxis_title = 'Features',
yaxis_title = 'Features',
font_color = '#ff8c00',
title_font_color = '#00fe35'
)
fig.show()
fig = px.scatter(
df,
x = 'Price',
y = 'Volatility',
hover_data = df,
hover_name = name,
trendline='ols'
)
custom(
t = 'Price vs Volatility analysis of all the Cryptocurrencies',
x = 'Price of Cryptocurrencies (in USD)',
y = 'Volatility of cryptocurrencies (in USD)'
)
fig.update_xaxes(range = [0, 1200])
fig.update_yaxes(range = [0, 500])
fig.show()
Here, as we can observe from the pattern between price and volatility, the majority of the data is linearly related, with the trend roughly following (2y = x). But there are exceptions, mostly the outliers with a price greater than 4k, which don't fit into this linear pattern. There are many possible reasons why volatility is proportional to price. Popularity is a highly likely one. Cryptocurrency markets are open 24-7, so there are people all across the globe trading cryptocurrencies at almost all times of the day, depending on their time zones. The more popular a cryptocurrency is, the more it will be traded. This means we have more investors in the popular currencies (like BTC and ETH), making them more expensive.
# Scatter of market capitalisation (x) against traded volume (y).
fig = px.scatter(df, x = 'MarketCap', y = 'Volume')
# Axis titles follow the plotted columns: MarketCap on x, Volume on y.
custom(
    t = 'Market Cap. vs Volume of all the Cryptocurrencies',
    x = 'Market Cap. of Cryptocurrencies (in USD)',
    y = 'Volume of Cryptocurrencies (in USD)'
)
fig.show()
In this graph of Market Cap. and Volume, we can observe the datapoints grouped towards the origin of the graph. On an overall look including the outliers, this looks like a linear pattern with a very low slope. But if we observe the grouped data points, the pattern is visible as a line running parallel to the x-axis.
The reason for this pattern is that most of our cryptocurrencies lie between 0 and 0.5 billion and cover a variety of market caps ranging from some thousands to millions, thus showing a straight line parallel to the x-axis.
px.scatter(df, x = 'Daily', y = 'Weekly')
Daily and Weekly have a low correlation. This is visible in the scatter plot above. The only reason that we see a slight correlation is that our data covers the time period when cryptocurrencies became very popular and the markets became very bullish. Hence both weekly trends and daily trends are bullish in nature, which explains the slight correlation.
1. Volatility and price of a cryptocurrency are highly correlated because both independently depend on the popularity of the coin.
2. Volume and Market Cap. have a high correlation. Note that Market Cap. is (Circulating Supply * Price), not (Volume * Price), so this correlation does not follow from the formula; it indicates that coins with a larger market cap are also traded more.
3. At the timestamp at which the data was collected, the market shows a bearish performance on the hourly rate, a very bullish performance on the daily rate and an almost balanced performance on the weekly rate.
4. Weekly trends and daily trends are weakly correlated, because the overall market trend is bullish.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
df = pd.read_csv(
    filepath_or_buffer = '../Data/clean_cryptocurrency_prices_by_date.csv',
    index_col = 0
)
# Parse the dates with the exact day-first format. `infer_datetime_format`
# is a boolean flag, so passing it a format string never selected this format.
df['Date'] = pd.to_datetime(df['Date'], format = '%d-%m-%Y %H:%M:%S')
df.head()
| Coin | Date | Price | |
|---|---|---|---|
| 0 | 0x | 2017-08-16 14:09:21 | 0.111725 |
| 1 | 0x | 2017-08-17 15:05:05 | 0.211486 |
| 2 | 0x | 2017-08-18 15:04:52 | 0.283789 |
| 3 | 0x | 2017-08-19 15:05:08 | 0.511434 |
| 4 | 0x | 2017-08-20 15:05:03 | 0.429522 |
def price_predict(coin):
    """Fit a one-step linear regression on a coin's prices and print a forecast.

    Each price is regressed on the price recorded immediately before it. The
    final observation is held out of training: the model predicts it from the
    second-to-last price, and both the prediction and the actual value are
    printed.

    coin -- normalised coin name as it appears in `df.Coin`

    Raises AssertionError if the coin is not in `df`, and ValueError if it
    has fewer than three distinct dated prices.
    """
    c = coin
    assert coin in df.Coin.unique(), f'{coin} is not available . .'
    # Work on this coin's own rows only. De-duplicating the shared `df` by date
    # would delete other coins' rows recorded at the same timestamp.
    history = df.loc[df.Coin == coin, ['Date', 'Price']]
    history = history.drop_duplicates(subset = 'Date').set_index('Date')
    if len(history) < 3:
        raise ValueError(f'{coin} does not have enough price history . .')
    # X holds the previous price and y the current one; the first row has no
    # predecessor, so it is skipped.
    X, y = history.shift(1).iloc[1:], history.iloc[1:]
    ml_model = LinearRegression()
    # Train on every pair except the last, which is kept for evaluation.
    ml_model.fit(X.iloc[:-1], y.iloc[:-1])
    # Passing a one-row DataFrame keeps the feature name seen during fitting.
    prediction = ml_model.predict(X.iloc[[-1]])[0][0]
    print('Coin - ', c)
    print('Prediction - ', prediction)
    print('Actual Value - ', y.iloc[-1].iloc[0])
price_predict('ethereum')
Coin - ethereum Prediction - 799.3354414367104 Actual Value - 897.326
Pareto Distribution -> https://www.tuannguyen.tech/2020/08/discussion-pareto-distribution/
Volatility -> https://seic.com/sites/default/files/inline-files/SEI_Standard-Deviation_UK.pdf
Pandas -> https://pandas.pydata.org/docs/reference
Plotting -> https://plotly.com/python/
Scikit-learn -> https://scikit-learn.org/stable/